General tasks
Introduction¶
Loading all necessary Libraries
# Import necessary libraries and packages
import csv
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import seaborn as sns
from scipy import stats
from sklearn import tree
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

warnings.simplefilter(action='ignore', category=FutureWarning)
Function to read txt data files and convert them to proper csv files :
- txtFile: input filename (including directory if applicable)
- csvFile: output filename (including directory if applicable)
- vtabchar: vertical tab character in the original file (to be replaced with newline command '\n')
- delim: delimiter character used in the original file (to be replaced with comma)
def txt2csv(txtFile, csvFile, vtabchar, delim):
    """Convert a raw text export into a plain CSV file.

    The source file uses ``vtabchar`` as its record separator and ``delim``
    as its field separator; both are rewritten to the conventional newline
    and comma before the result is saved.

    Parameters:
        txtFile:  input filename (including directory if applicable)
        csvFile:  output filename (including directory if applicable)
        vtabchar: record-separator character in the original file
                  (replaced with '\n')
        delim:    field-delimiter string in the original file
                  (replaced with ',')
    """
    # Read the whole file, then normalise separators in one pass each.
    with open(txtFile, 'r') as src:
        raw = src.read()
    cleaned = raw.replace(vtabchar, '\n').replace(delim, ',')
    with open(csvFile, 'w') as dst:
        dst.write(cleaned)
Define the file paths
# Input (raw text export) and output (CSV) paths for the T16 part data
txt_file_path = "Einzelteil_T16.txt"
csv_file_path = "Einzelteil_T16.csv"
# NOTE(review): the third argument should be the file's record-separator
# (vertical-tab) character; it shows up as plain spaces here, likely mangled
# when the notebook was flattened — confirm against the original file.
txt2csv(txt_file_path, csv_file_path, ' ', ' | | ')
Load all relevant Files
# Load the converted T16 file. quoting=csv.QUOTE_NONE keeps the literal
# double quotes from the raw export, so they are stripped manually below.
Einzelteil = pd.read_csv(csv_file_path, low_memory=False,
delimiter=",", quoting=csv.QUOTE_NONE, encoding='utf-8' )
# Remove quotes from the headers
Einzelteil.columns = Einzelteil.columns.str.replace('"', '')
# Remove quotes from all string entries in the DataFrame
Einzelteil = Einzelteil.replace({'"': ''}, regex=True)
# Inspect the first rows to verify the load
Einzelteil.head()
| X1 | ID_T16.x | Produktionsdatum.x | Herstellernummer.x | Werksnummer.x | Fehlerhaft.x | Fehlerhaft_Datum.x | Fehlerhaft_Fahrleistung.x | ID_T16.y | Produktionsdatum.y | ... | Fehlerhaft.y | Fehlerhaft_Datum.y | Fehlerhaft_Fahrleistung.y | ID_T16 | Produktionsdatum | Herstellernummer | Werksnummer | Fehlerhaft | Fehlerhaft_Datum | Fehlerhaft_Fahrleistung | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| "1" | 1 | 16-212-2121-7 | 2008-11-07 | 212 | 2121.0 | 0.0 | NaN | 0.0 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| "2" | 2 | 16-212-2122-41 | 2008-11-08 | 212 | 2122.0 | 0.0 | NaN | 0.0 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| "3" | 5 | 16-212-2121-36 | 2008-11-07 | 212 | 2121.0 | 0.0 | NaN | 0.0 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| "4" | 10 | 16-212-2122-20 | 2008-11-07 | 212 | 2122.0 | 0.0 | NaN | 0.0 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| "5" | 12 | 16-212-2122-33 | 2008-11-07 | 212 | 2122.0 | 0.0 | NaN | 0.0 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 22 columns
# Failure-analysis data for OEM1 Typ11 vehicles (comma-separated)
Fahrzeuge_df = pd.read_csv("Fahrzeuge_OEM1_Typ11_Fehleranalyse.csv")
Fahrzeuge_df.head()
| Unnamed: 0 | X | X1 | ID_Fahrzeug | Herstellernummer | Werksnummer | Fehlerhaft_Datum | Fehlerhaft_Fahrleistung | days | fuel | engine | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 9 | 9 | 9 | 11-1-11-9 | 1 | 11 | 2010-03-16 | 34824.319559 | 1493.150761 | 4.003670 | small |
| 1 | 11 | 11 | 11 | 11-1-11-11 | 1 | 11 | 2010-03-16 | 74217.428309 | 1044.462231 | 11.042487 | large |
| 2 | 13 | 13 | 13 | 11-1-11-13 | 1 | 11 | 2010-03-16 | 32230.699639 | 749.669810 | 3.579117 | small |
| 3 | 15 | 15 | 15 | 11-1-11-15 | 1 | 11 | 2010-03-16 | 44885.783551 | 858.688003 | 4.666801 | small |
| 4 | 37 | 37 | 37 | 11-1-11-37 | 1 | 11 | 2010-03-17 | 86348.329866 | 1478.204174 | 4.634381 | small |
# Production data for component K7 (semicolon-separated)
komponente_k7_df = pd.read_csv("Komponente_K7.csv", delimiter=';')
komponente_k7_df.head()
| Unnamed: 0 | IDNummer | Produktionsdatum | Herstellernummer | Werksnummer | Fehlerhaft | |
|---|---|---|---|---|---|---|
| 0 | 1 | K7-114-1142-1 | 2008-11-12 | 114 | 1142 | 0 |
| 1 | 2 | K7-114-1142-2 | 2008-11-12 | 114 | 1142 | 0 |
| 2 | 3 | K7-114-1142-3 | 2008-11-13 | 114 | 1142 | 0 |
| 3 | 4 | K7-114-1142-4 | 2008-11-13 | 114 | 1142 | 0 |
| 4 | 5 | K7-114-1142-5 | 2008-11-13 | 114 | 1142 | 0 |
# Logistics-delay (goods receipt) data for K7; QUOTE_NONE keeps the literal
# double quotes, which are stripped manually below
logistikverzug_df = pd.read_csv("Logistikverzug_K7.csv", delimiter=',',quoting=csv.QUOTE_NONE)
# Remove quotes from the headers
logistikverzug_df.columns = logistikverzug_df.columns.str.replace('"', '')
# Remove quotes from all string entries in the DataFrame
logistikverzug_df = logistikverzug_df.replace({'"': ''}, regex=True)
logistikverzug_df.head()
| IDNummer | Wareneingang | Herstellernummer | Werksnummer | Fehlerhaft | ||
|---|---|---|---|---|---|---|
| 0 | 1 | K7-113-1132-153160 | 2016-11-22 | 112 | 1132 | 0 |
| 1 | 2 | K7-113-1132-153109 | 2016-11-20 | 112 | 1132 | 0 |
| 2 | 3 | K7-113-1132-153195 | 2016-11-20 | 112 | 1132 | 0 |
| 3 | 4 | K7-113-1132-153226 | 2016-11-20 | 112 | 1132 | 0 |
| 4 | 5 | K7-113-1132-153231 | 2016-11-20 | 112 | 1132 | 0 |
# Read file, skipping malformed lines
# NOTE(review): on_bad_lines='skip' silently drops rows with the wrong
# number of fields — verify how many registrations are lost this way.
zulassungen_df = pd.read_csv("Zulassungen_alle_Fahrzeuge.csv", on_bad_lines='skip')
# Display the first few rows of the dataframe to ensure it loaded correctly
zulassungen_df.head()
| Unnamed: 0 | IDNummer | Gemeinden | Zulassung | |
|---|---|---|---|---|
| 0 | 408097 | 11-1-11-1 | DRESDEN | 01-01-2009 |
| 1 | 408098 | 11-1-11-2 | DRESDEN | 01-01-2009 |
| 2 | 1 | 12-1-12-1 | LEIPZIG | 01-01-2009 |
| 3 | 2 | 12-1-12-2 | LEIPZIG | 01-01-2009 |
| 4 | 3 | 12-1-12-3 | DORTMUND | 01-01-2009 |
# Parts list for OEM1 Typ12 vehicles; semicolon-separated, quotes stripped manually
Bestandteile_Fahrzeuge_OEM1_Typ12_df = pd.read_csv('Bestandteile_Fahrzeuge_OEM1_Typ12.csv',sep=';',quoting=csv.QUOTE_NONE)
# Remove quotes from the headers
Bestandteile_Fahrzeuge_OEM1_Typ12_df.columns = Bestandteile_Fahrzeuge_OEM1_Typ12_df.columns.str.replace('"', '')
# Remove quotes from all string entries in the DataFrame
Bestandteile_Fahrzeuge_OEM1_Typ12_df = Bestandteile_Fahrzeuge_OEM1_Typ12_df.replace({'"': ''}, regex=True)
Bestandteile_Fahrzeuge_OEM1_Typ12_df.head()
| ID_Karosserie | ID_Schaltung | ID_Sitze | ID_Motor | ID_Fahrzeug | ||
|---|---|---|---|---|---|---|
| 0 | 1 | K5-112-1122-1 | K3SG1-105-1051-4 | K2ST1-109-1092-1 | K1BE1-101-1011-90 | 12-1-12-1 |
| 1 | 2 | K5-112-1122-11 | K3AG1-105-1051-9 | K2ST1-109-1092-16 | K1BE1-101-1011-2 | 12-1-12-2 |
| 2 | 3 | K5-112-1122-2 | K3AG1-105-1051-23 | K2ST1-109-1092-21 | K1BE1-101-1011-8 | 12-1-12-3 |
| 3 | 4 | K5-112-1122-3 | K3AG1-106-1061-66 | K2ST1-109-1092-70 | K1BE1-101-1011-11 | 12-1-12-4 |
| 4 | 5 | K5-112-1122-4 | K3SG1-105-1051-8 | K2ST1-109-1092-78 | K1BE1-101-1011-42 | 12-1-12-5 |
# Define the columns to read and their new names for Komponente.
# Bestandteile_Komponente_K2ST2.csv carries literal double quotes in its
# header (read with QUOTE_NONE), hence the quoted keys in the first mapping;
# the K2LE2 file uses plain headers.
columns_to_read = {'"ID_T16"': 'ID_T16', '"ID_K2ST2"': 'ID_Komponente'} # Original to new name mapping
columns_to_read_2 = {'ID_T16': 'ID_T16', 'ID_K2LE2': 'ID_Komponente'} # Original to new name mapping
columns_to_use = list(columns_to_read.keys()) # Specify which columns to read
columns_to_use_2 = list(columns_to_read_2.keys()) # Specify which columns to read
# Load the component files
Komponente_K2LE2_df = pd.read_csv("Bestandteile_Komponente_K2LE2.csv",usecols=columns_to_use_2,sep=';',quoting=csv.QUOTE_NONE)
Komponente_K2ST2_df = pd.read_csv("Bestandteile_Komponente_K2ST2.csv",usecols=columns_to_use,sep=';',quoting=csv.QUOTE_NONE)
# Rename the columns using the mapping
Komponente_K2LE2_df.rename(columns=columns_to_read_2, inplace=True)
Komponente_K2ST2_df.rename(columns=columns_to_read, inplace=True)
# Bug fix: with QUOTE_NONE the K2ST2 string values keep their embedded
# double quotes (e.g. '"16-212-..."'), which would silently prevent matches
# in the later joins on ID_T16 / ID_Komponente. Strip them, consistent with
# the quote handling applied to the other files.
Komponente_K2ST2_df = Komponente_K2ST2_df.replace({'"': ''}, regex=True)
Komponente_K2LE2_df.head()
| ID_T16 | ID_Komponente | |
|---|---|---|
| 0 | 16-213-2132-44 | K2LE2-111-1111-1 |
| 1 | 16-215-2152-68 | K2LE2-111-1111-2 |
| 2 | 16-212-2121-9 | K2LE2-111-1111-3 |
| 3 | 16-212-2121-16 | K2LE2-111-1111-4 |
| 4 | 16-212-2121-19 | K2LE2-111-1111-5 |
# Merge the two component variants into one T16 -> component lookup table
Komponente_df = pd.concat([Komponente_K2LE2_df, Komponente_K2ST2_df], ignore_index=True)
Komponente_df.head()
| ID_T16 | ID_Komponente | |
|---|---|---|
| 0 | 16-213-2132-44 | K2LE2-111-1111-1 |
| 1 | 16-215-2152-68 | K2LE2-111-1111-2 |
| 2 | 16-212-2121-9 | K2LE2-111-1111-3 |
| 3 | 16-212-2121-16 | K2LE2-111-1111-4 |
| 4 | 16-212-2121-19 | K2LE2-111-1111-5 |
# Define the columns to read and their new names for Fahrzeuge
# (the quoted keys match the literal headers produced by QUOTE_NONE)
to_read = {'"ID_Sitze"': 'ID_Komponente', '"ID_Fahrzeug"': 'IDNummer'} # Original to new name mapping
to_use = list(to_read.keys()) # Specify which columns to read
# Load the Fahrzeuge files (seat component -> vehicle mapping per OEM2 type)
Fahrzeuge_OEM2_Typ21_df = pd.read_csv("Bestandteile_Fahrzeuge_OEM2_Typ21.csv" ,usecols=to_use,sep=';',quoting=csv.QUOTE_NONE)
Fahrzeuge_OEM2_Typ22_df = pd.read_csv("Bestandteile_Fahrzeuge_OEM2_Typ22.csv" ,usecols=to_use,sep=';',quoting=csv.QUOTE_NONE)
# Merge the Fahrzeuge files
Fahrzeuge_Merged_df = pd.concat([Fahrzeuge_OEM2_Typ21_df, Fahrzeuge_OEM2_Typ22_df], ignore_index=True)
# Rename the columns using the mapping
Fahrzeuge_Merged_df.rename(columns=to_read, inplace=True)
# Remove quotes from all string entries in the DataFrame
Fahrzeuge_Merged_df = Fahrzeuge_Merged_df.replace({'"': ''}, regex=True)
Fahrzeuge_Merged_df
| ID_Komponente | IDNummer | |
|---|---|---|
| 0 | K2ST2-109-1092-44 | 21-2-21-1 |
| 1 | K2ST2-109-1092-42 | 21-2-21-2 |
| 2 | K2ST2-109-1092-74 | 21-2-21-4 |
| 3 | K2ST2-109-1092-31 | 21-2-21-5 |
| 4 | K2ST2-109-1092-13 | 21-2-21-6 |
| ... | ... | ... |
| 19344 | K2ST2-109-1092-106993 | 22-2-21-9559 |
| 19345 | K2ST2-109-1092-106996 | 22-2-21-9560 |
| 19346 | K2ST2-109-1092-107164 | 22-2-21-9561 |
| 19347 | K2ST2-109-1092-107169 | 22-2-21-9562 |
| 19348 | K2ST2-109-1092-107179 | NaN |
19349 rows × 2 columns
Logistics and Product Development in the Automobile Industry¶
Use the production date (“Produktionsdatum”) from the dataset “Komponente_K7.csv” and the receiving date of incoming goods (“Wareneingang”) from “Logistikverzug_K7.csv” (logistics delay). Assume that produced goods are issued one day after the production date. For the model design in Python, create a new dataset “Logistics delay” containing the required information from both datasets.
komponente_k7_df.head()
| Unnamed: 0 | IDNummer | Produktionsdatum | Herstellernummer | Werksnummer | Fehlerhaft | |
|---|---|---|---|---|---|---|
| 0 | 1 | K7-114-1142-1 | 2008-11-12 | 114 | 1142 | 0 |
| 1 | 2 | K7-114-1142-2 | 2008-11-12 | 114 | 1142 | 0 |
| 2 | 3 | K7-114-1142-3 | 2008-11-13 | 114 | 1142 | 0 |
| 3 | 4 | K7-114-1142-4 | 2008-11-13 | 114 | 1142 | 0 |
| 4 | 5 | K7-114-1142-5 | 2008-11-13 | 114 | 1142 | 0 |
logistikverzug_df.head()
| IDNummer | Wareneingang | Herstellernummer | Werksnummer | Fehlerhaft | ||
|---|---|---|---|---|---|---|
| 0 | 1 | K7-113-1132-153160 | 2016-11-22 | 112 | 1132 | 0 |
| 1 | 2 | K7-113-1132-153109 | 2016-11-20 | 112 | 1132 | 0 |
| 2 | 3 | K7-113-1132-153195 | 2016-11-20 | 112 | 1132 | 0 |
| 3 | 4 | K7-113-1132-153226 | 2016-11-20 | 112 | 1132 | 0 |
| 4 | 5 | K7-113-1132-153231 | 2016-11-20 | 112 | 1132 | 0 |
# Convert the date columns to proper datetime dtype
komponente_k7_df['Produktionsdatum'] = pd.to_datetime(komponente_k7_df['Produktionsdatum'])
logistikverzug_df['Wareneingang'] = pd.to_datetime(logistikverzug_df['Wareneingang'])
# Assumption from the task: goods are issued one day after the production date
komponente_k7_df['Issued_Products'] = komponente_k7_df['Produktionsdatum'] + pd.Timedelta(days=1)
# Merge production data with goods-receipt data on the part ID
merged_df = pd.merge(komponente_k7_df, logistikverzug_df, on='IDNummer')
# Calculate the Logistics Delay
# NOTE(review): the trailing "+ 1" re-adds the production->issue day on top
# of the transit time (Wareneingang - Issued_Products); confirm this matches
# the intended definition of "logistics delay".
merged_df['LogisticsDelay'] = (merged_df['Wareneingang'] - merged_df['Issued_Products']).dt.days + 1
# Keep only the columns relevant to the delay analysis
logistics_delay_df = merged_df[['IDNummer', 'Produktionsdatum', 'Issued_Products', 'Wareneingang', 'LogisticsDelay']]
# Display the new dataframe
logistics_delay_df
| IDNummer | Produktionsdatum | Issued_Products | Wareneingang | LogisticsDelay | |
|---|---|---|---|---|---|
| 0 | K7-114-1142-1 | 2008-11-12 | 2008-11-13 | 2008-11-19 | 7 |
| 1 | K7-114-1142-2 | 2008-11-12 | 2008-11-13 | 2008-11-19 | 7 |
| 2 | K7-114-1142-3 | 2008-11-13 | 2008-11-14 | 2008-11-20 | 7 |
| 3 | K7-114-1142-4 | 2008-11-13 | 2008-11-14 | 2008-11-20 | 7 |
| 4 | K7-114-1142-5 | 2008-11-13 | 2008-11-14 | 2008-11-19 | 6 |
| ... | ... | ... | ... | ... | ... |
| 306485 | K7-113-1132-153241 | 2016-11-12 | 2016-11-13 | 2016-11-19 | 7 |
| 306486 | K7-113-1132-153242 | 2016-11-12 | 2016-11-13 | 2016-11-19 | 7 |
| 306487 | K7-113-1132-153243 | 2016-11-12 | 2016-11-13 | 2016-11-20 | 8 |
| 306488 | K7-113-1132-153244 | 2016-11-12 | 2016-11-13 | 2016-11-18 | 6 |
| 306489 | K7-113-1132-153245 | 2016-11-13 | 2016-11-14 | 2016-11-20 | 7 |
306490 rows × 5 columns
# Take a 1% random sample of the data for analysis, statistical testing and
# visualization: plots stay readable and computation time is reduced.
# random_state=1 makes the draw reproducible.
sample_size = 0.01
logistics_delay_sample = logistics_delay_df.sample(frac=sample_size, random_state=1)
# Histogram with kernel-density overlay of the sampled delays
plt.figure(figsize=(7, 4))
sns.histplot(logistics_delay_sample['LogisticsDelay'], kde=True, bins=20)
plt.title('Distribution of Logistics Delay')
plt.xlabel('Logistics Delay (days)')
plt.ylabel('Frequency')
plt.show()
Interpretation: The histogram and density plot of the sample set show multiple peaks suggesting that the data don't follow a normal distribution
# Shapiro-Wilk test for normality of the sampled logistics delays.
# H0: the delays are normally distributed; reject when p <= 0.05.
stat, p_value = stats.shapiro(logistics_delay_sample['LogisticsDelay'])
print(f'Shapiro-Wilk Test: Statistics={stat}, p-value={p_value}')
# Select the interpretation message based on the significance level.
verdict = (
    "The logistics delay follows a normal distribution (fail to reject H0)."
    if p_value > 0.05
    else "The logistics delay does not follow a normal distribution (reject H0)."
)
print(verdict)
Shapiro-Wilk Test: Statistics=0.8826102018356323, p-value=1.6389879037292943e-11 The logistics delay does not follow a normal distribution (reject H0).
Interpretation: Both the visual representation and the Shapiro-Wilk Test indicate that the Logistics Delay does not follow a normal distribution. In this case, further testing should be performed.
### Kolmogorov-Smirnov Test for Exponential Distribution
# Bug fix: for 'expon', kstest's args are (loc, scale), not (mean, std).
# Passing the mean as loc shifts the hypothesized distribution so that it
# starts at the sample mean, which guarantees an enormous KS statistic.
# Fit loc/scale properly, consistent with the gamma and log-normal tests.
expon_params = stats.expon.fit(logistics_delay_sample['LogisticsDelay'])
d_stat, d_p_value = stats.kstest(logistics_delay_sample['LogisticsDelay'], 'expon', args=expon_params)
print(f'Kolmogorov-Smirnov Test: Statistics={d_stat}, p-value={d_p_value}')
# Interpretation of Kolmogorov-Smirnov Test
# H0: The logistics delay follows an exponential distribution
# if p value > 0.05, we fail to reject, the data follow an exponential distribution
# if p value < 0.05, we reject, the data do not follow an exponential distribution
if d_p_value > 0.05:
    print("The logistics delay follows an exponential distribution (fail to reject H0).")
else:
    print("The logistics delay does not follow an exponential distribution (reject H0).")
Kolmogorov-Smirnov Test: Statistics=0.6911764705882353, p-value=9.864356195759541e-98 The logistics delay does not follow an exponential distribution (reject H0).
Interpretation: The Kolmogorov-Smirnov Test indicates that the Logistics Delay does not follow an exponential distribution.
### Kolmogorov-Smirnov Test for Gamma Distribution
# Fit a gamma distribution to the sampled delays and check the fit with a
# Kolmogorov-Smirnov test.
# H0: the logistics delay follows a gamma distribution; reject when p <= 0.05.
gamma_params = stats.gamma.fit(logistics_delay_sample['LogisticsDelay'])
d_stat_gamma, d_p_value_gamma = stats.kstest(logistics_delay_sample['LogisticsDelay'], 'gamma', args=gamma_params)
print(f'Kolmogorov-Smirnov Test for Gamma Distribution: Statistics={d_stat_gamma}, p-value={d_p_value_gamma}')
if d_p_value_gamma > 0.05:
    print("The logistics delay follows a gamma distribution (fail to reject H0).")
else:
    print("The logistics delay does not follow a gamma distribution (reject H0).")
Kolmogorov-Smirnov Test for Gamma Distribution: Statistics=0.19224848153764573, p-value=4.4486979686458775e-07 The logistics delay does not follow a gamma distribution (reject H0).
Interpretation: The Kolmogorov-Smirnov Test indicates that the Logistics Delay does not follow a gamma distribution.
### Kolmogorov-Smirnov Test for Log-Normal Distribution
# Fit a log-normal distribution to the sampled delays and check the fit
# with a Kolmogorov-Smirnov test.
# H0: the logistics delay is log-normally distributed; reject when p <= 0.05.
lognorm_params = stats.lognorm.fit(logistics_delay_sample['LogisticsDelay'])
d_stat_lognorm, d_p_value_lognorm = stats.kstest(logistics_delay_sample['LogisticsDelay'], 'lognorm', args=lognorm_params)
print(f'Kolmogorov-Smirnov Test for Log-Normal Distribution: Statistics={d_stat_lognorm}, p-value={d_p_value_lognorm}')
if d_p_value_lognorm > 0.05:
    print("The logistics delay follows a log-normal distribution (fail to reject H0).")
else:
    print("The logistics delay does not follow a log-normal distribution (reject H0).")
Kolmogorov-Smirnov Test for Log-Normal Distribution: Statistics=0.19837991802563315, p-value=1.6408074566638096e-07 The logistics delay does not follow a log-normal distribution (reject H0).
Interpretation: The Kolmogorov-Smirnov Test indicates that the Logistics Delay does not follow a log-normal distribution.
The results of our statistical tests indicate that the data do not follow a normal, exponential, gamma or log-normal distribution. The Logistics Delay data seem to have a complex distribution that is not captured by these parametric distributions.
To get more insight of the data, we can use the Kernel Density Estimation (KDE), to model the distribution without assuming any specific parametric form.
# KDE of the sampled delays. Fix: `fill=True` replaces the `shade=True`
# argument, which is deprecated and removed in recent seaborn releases.
plt.figure(figsize=(7, 4))
sns.kdeplot(logistics_delay_sample['LogisticsDelay'], fill=True)
plt.title('Kernel Density Estimation of Logistics Delay (Sample)')
plt.xlabel('Logistics Delay (days)')
plt.ylabel('Density')
plt.show()
The KDE plot shows multiple peaks, reinforcing the indication that the logistics delay is multimodal. Specifically we notice peaks around 6, 7, 8 and 9. Which suggests that there may exist distinct groups (clusters) within the data.
To get a better understanding, we can use unsupervised learning, to identify patterns and structures within the data. More specifically: Cluster Analysis.
# Cluster Analysis
# Reshape to the (n_samples, 1) layout expected by scikit-learn
data_for_clustering = logistics_delay_sample['LogisticsDelay'].values.reshape(-1, 1)
# K-Means with 5 clusters. n_init is pinned explicitly: its default changed
# to 'auto' in scikit-learn 1.4, so pinning it keeps results stable across
# versions and silences the deprecation warning.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=1).fit(data_for_clustering)
logistics_delay_sample['Cluster'] = kmeans.labels_
# Plot the delays colored by cluster assignment
plt.figure(figsize=(8, 5))
sns.histplot(data=logistics_delay_sample, x='LogisticsDelay', hue='Cluster', kde=True, bins=25, palette='Set1')
plt.title('Logistics Delay in Clusters')
plt.xlabel('Logistics Delay (in days)')
plt.ylabel('Frequency')
plt.show()
# Describe each cluster's delay distribution.
# A loop replaces the ten copy-pasted cluster_N / summary_N statements;
# the printed output is identical to the original cell.
for cluster_id in range(5):
    members = logistics_delay_sample[logistics_delay_sample['Cluster'] == cluster_id]
    print(f"Cluster {cluster_id}", members['LogisticsDelay'].describe())
C:\Users\A R K A\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn(
Cluster 0 count 76.0 mean 7.0 std 0.0 min 7.0 25% 7.0 50% 7.0 75% 7.0 max 7.0 Name: LogisticsDelay, dtype: float64 Cluster 1 count 43.0 mean 8.0 std 0.0 min 8.0 25% 8.0 50% 8.0 75% 8.0 max 8.0 Name: LogisticsDelay, dtype: float64 Cluster 2 count 65.000000 mean 5.953846 std 0.211451 min 5.000000 25% 6.000000 50% 6.000000 75% 6.000000 max 6.000000 Name: LogisticsDelay, dtype: float64 Cluster 3 count 6.000000 mean 10.166667 std 0.408248 min 10.000000 25% 10.000000 50% 10.000000 75% 10.000000 max 11.000000 Name: LogisticsDelay, dtype: float64 Cluster 4 count 14.0 mean 9.0 std 0.0 min 9.0 25% 9.0 50% 9.0 75% 9.0 max 9.0 Name: LogisticsDelay, dtype: float64
After performing the Cluster Analysis we can confirm that the distribution is multimodal. More specifically, delays cluster around:
5 days
6 days
7 days
8 days
9 to 12 days
This indicates that the delays aren't spread evenly across a range but are instead concentrated at specific points.
b. Determine the mean logistics delay, considering weekends. Interpret this number and discuss possible alternatives.
# Mean (and full summary statistics) of the logistics delay via describe();
# weekends are included in the day counts.
logistics_delay_df['LogisticsDelay'].describe()
count 20377.000000 mean 7.084311 std 1.014112 min 4.000000 25% 6.000000 50% 7.000000 75% 8.000000 max 12.000000 Name: LogisticsDelay, dtype: float64
The mean logistics delay, considering the weekends, is calculated to be 7.08 days. This number reflects the average time it takes for goods to move through the logistics process, including the time when operations might be slower or paused over the weekend. We can see that also during the Cluster Analysis. Cluster 0, in which the delay is 7 days, has the highest count of observations.
Impact of the weekend: Including the weekends in the calculation of the logistics delay can artificially inflate the delay times. For instance, if goods are produced on a Friday, they might not move forward in the process until the following business day, Monday, adding several days to the logistics delay.
A possible alternative would be to implement or increase weekend operations. For example, some automated processes or partial shifts. This way, the impact of the non-working time during the weekend would be minimized.
Another alternative would be to ensure that goods are not produced on a Friday, that way ensures that the time it takes for goods to move through the logistics process will not be artificially inflated.
c. Visualize the distribution appropriately by displaying the histogram and density function using “plotly.” Describe how you selected the size of the bins
# Create a histogram and overlay a density function (plotly figure_factory);
# bin_size=1 gives one bin per whole day of delay
fig = ff.create_distplot(hist_data=[logistics_delay_sample['LogisticsDelay']], group_labels=['Logistics Delay'], bin_size=1, show_hist=True, show_curve=True)
# Customize the layout
fig.update_layout(
title='Distribution of Logistics Delays',
xaxis_title='Logistics Delay (days)',
bargap=0.2)
# Show the plot
fig.show()
The bin size for a histogram is crucial because it affects the presentation of the data. In this particular case, since the data is discrete - delays measured in whole days - a bin size of 1 (day) is appropriate. This way each delay day has its own bin, clearly highlighting the distribution of the delay time.
d) Describe the process for creating a decision tree to classify whether the component (K7) is defective (Fehlerhaft) or not. (Hint: Use visualizations.)
Step 1: Import Packages¶
Import packages that will be used for the creation of the tree: matplotlib.pyplot, sklearn etc
Step 2: Data Preparation¶
Merge Datasets¶
Combine Komponente_K7 - including the production details of K7 & Logistikverzug_K7 - which includes the logistics delay of K7
Handle Data¶
Examine the data, check for missing values and decide how to handle them (this can be either imputing mean/median or deleting rows with missing data)
Step 3 Visualize Data¶
Plot a correlation matrix to see how the different features (production date, logistics delay, Herstellernummer, Werknummer) interact with the defectiveness of K7. This can help select the most relevant features for the model.
In addition to that, a boxplot or a histogram can be used to determine the distribution of the data and get a deeper understanding.
Step 4 Define Features and Target Variable¶
Target Variable:¶
Is the component defective or not?
Features:¶
Determine the features that are most relevant for target variable - for example one of the primary features can be Logistics Delay, or a combination of the Logistics Delay and Herstellernummer.
Step 5: Splitting the data¶
Split the dataset into a training set and a testing set. The training set will be used to train the model and the test set to evaluate the performance of the decision tree. This split can have a balance 70% training and 30% testing.
Step 6: Building the decision tree¶
Use a decision tree classifier from the scikit-learn library. The decision tree algorithm will automatically determine the best splits in the data to classify components as defective or not.
Consider tuning hyperparameters like: maximum depth of the tree, minimum samples per leaf.
Step 7: Evaluate the model¶
Confusion matrix: Evaluate the model using a confusion matrix to determine the number of true positives, false positives, true negatives, and false negatives.
Step 8: Visualize the decision tree¶
Visualize the tree. For example using plot_tree from scikit-learn
Step 9: Final model interpretation¶
Interpret the tree's structure
Data Storage in Separate Files¶
Explain why it makes sense to store the available data in separate files instead of saving everything in one large table. Name at least four benefits. The available tables represent a typical database structure. What is this structure called?
Data security:¶
When saved in separate files, sensitive data are better protected. For example, in case of data corruption or loss, the impact is limited to the affected files rather than the whole dataset.
Enhanced performance¶
Large tables can lead to slower performance and longer processing times. By creating separate files, queries can be executed faster and more efficiently, as the exposure to unnecessary data is limited.
Data Integrity¶
When data are stored in smaller separate files, the complexity of managing and processing the data is reduced. This leads to fewer errors, which can help maintain accuracy and consistency.
Scalability and ease of integration.¶
As the data grows, separate files allow the system to scale more efficiently. New files can be added, or older ones removed, without having to touch the whole system. Similarly, separate files are more easily integrated into different systems.
Easier maintenance and debugging¶
If there is an issue with the data, identifying and fixing the problem is easier when the data is organized into separate files. In addition to that, tasks such as cleaning or updating can be performed quicker, as one is focused on specific files instead of the whole data set.
The name of such a database structure is: Relational Database Structure¶
Parts T16 in Registered Vehicles¶
Determine how many parts T16 ended up in vehicles registered in Adelshofen
Extract the unique column names (without .x or .y)
# Extract the unique base column names (without the .x / .y merge suffixes)
base_columns = set(col.split('.')[0] for col in Einzelteil.columns)
# Initialize an empty DataFrame to store the combined columns
combined_df = pd.DataFrame()
# For every base name, coalesce the available variants, preferring the
# unsuffixed column, then '.x', then '.y'. Unlike the original if/elif
# chain, this does not raise a KeyError when a suffixed column exists
# without an unsuffixed counterpart.
for col in base_columns:
    variants = [c for c in (col, f"{col}.x", f"{col}.y") if c in Einzelteil.columns]
    combined = Einzelteil[variants[0]]
    for extra in variants[1:]:
        combined = combined.combine_first(Einzelteil[extra])
    combined_df[col] = combined
combined_df.head()
| Produktionsdatum | Fehlerhaft_Datum | Herstellernummer | Fehlerhaft | Fehlerhaft_Fahrleistung | X1 | Werksnummer | ID_T16 | |
|---|---|---|---|---|---|---|---|---|
| "1" | 2008-11-07 | NaN | 212 | 0.0 | 0.0 | 1 | 2121.0 | 16-212-2121-7 |
| "2" | 2008-11-08 | NaN | 212 | 0.0 | 0.0 | 2 | 2122.0 | 16-212-2122-41 |
| "3" | 2008-11-07 | NaN | 212 | 0.0 | 0.0 | 5 | 2121.0 | 16-212-2121-36 |
| "4" | 2008-11-07 | NaN | 212 | 0.0 | 0.0 | 10 | 2122.0 | 16-212-2122-20 |
| "5" | 2008-11-07 | NaN | 212 | 0.0 | 0.0 | 12 | 2122.0 | 16-212-2122-33 |
# Distinct values per combined column — sanity check: ID_T16 and X1 both
# show 5452 unique values, matching the number of parts
combined_df.nunique()
Produktionsdatum 101 Fehlerhaft_Datum 81 Herstellernummer 2 Fehlerhaft 2 Fehlerhaft_Fahrleistung 54 X1 5452 Werksnummer 2 ID_T16 5452 dtype: int64
Merge all the files
# Step 1: Join combined_df and Komponente_df on ID_T16
merged_df1 = pd.merge(combined_df, Komponente_df, on='ID_T16', how='inner')
# Step 2: Join the result with Fahrzeuge_Merged_df on ID_Komponente
merged_df2 = pd.merge(merged_df1, Fahrzeuge_Merged_df, on='ID_Komponente', how='inner')
# Step 3: Join the result with zulassungen_df on IDNummer
# NOTE(review): inner joins silently drop parts without a matching
# component/vehicle/registration row — confirm that is intended.
final_merged_df = pd.merge(merged_df2, zulassungen_df, on='IDNummer', how='inner')
# Display the first few rows of the final merged DataFrame
final_merged_df.head()
| Produktionsdatum | Fehlerhaft_Datum | Herstellernummer | Fehlerhaft | Fehlerhaft_Fahrleistung | X1 | Werksnummer | ID_T16 | ID_Komponente | IDNummer | Unnamed: 0 | Gemeinden | Zulassung | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008-11-07 | NaN | 212 | 0.0 | 0.0 | 14 | 2121.0 | 16-212-2121-16 | K2LE2-111-1111-4 | 22-2-21-14 | 2897628 | EISENHUETTENSTADT | 02-01-2009 |
| 1 | 2008-11-12 | NaN | 212 | 0.0 | 0.0 | 857 | 2121.0 | 16-212-2121-300 | K2LE2-111-1111-314 | 22-2-21-176 | 2898471 | BOCHUM | 23-01-2009 |
| 2 | 2008-11-14 | NaN | 212 | 0.0 | 0.0 | 861 | 2121.0 | 16-212-2121-372 | K2LE2-111-1111-424 | 22-2-21-180 | 2898475 | GELSENKIRCHEN | 23-01-2009 |
| 3 | 2008-11-18 | NaN | 212 | 0.0 | 0.0 | 1488 | 2122.0 | 16-212-2122-96140 | K2LE2-111-1111-746 | 22-2-21-296 | 2899102 | METTINGEN | 10-02-2009 |
| 4 | 2008-11-07 | NaN | 212 | 0.0 | 0.0 | 2524 | 2122.0 | 16-212-2122-13 | K2LE2-111-1111-21 | 21-2-21-29 | 2385289 | SOEMMERDA | 19-02-2009 |
Filter the DataFrame where Gemeinden is 'Adelshofen'
# Municipality names are stored upper-case, so filter on 'ADELSHOFEN'
filtered_df = final_merged_df[final_merged_df['Gemeinden'] == 'ADELSHOFEN']
Count the number of unique ID_T16 values
# Count the distinct T16 parts that ended up in Adelshofen-registered vehicles
unique_id_count = filtered_df['ID_T16'].nunique()
print(f"Number of unique ID_T16 where Gemeinden = 'Adelshofen': {unique_id_count}")
Number of unique ID_T16 where Gemeinden = 'Adelshofen': 0
Attributes of the Registration Table¶
Identify the data types of the attributes in the registration table “Zulassungen_aller_Fahrzeuge.” Present your answers in a table integrated into your Markdown document and describe the characteristics of the data types.
# Checking data types of the attributes in Zulassungen_alle_Fahrzeuge
data_types = zulassungen_df.dtypes
# Present the dtypes as a one-column table
data_types_table = pd.DataFrame(data_types, columns=['Data Type'])
print(data_types_table)
Data Type Unnamed: 0 int64 IDNummer object Gemeinden object Zulassung object
# Rename the unnamed CSV index column to something meaningful
zulassungen_df.rename(columns={'Unnamed: 0': 'Index'}, inplace=True)
# Convert the registration date (stored as DD-MM-YYYY text) to datetime
zulassungen_df['Zulassung'] = pd.to_datetime(zulassungen_df['Zulassung'], format='%d-%m-%Y')
# Compare unique vs. total values of 'Gemeinden' to decide whether a
# categorical dtype would save memory
unique_gemeinden = zulassungen_df['Gemeinden'].nunique()
total_gemeinden = zulassungen_df['Gemeinden'].count()
print(f"Total entries in 'Gemeinden': {total_gemeinden}")
print(f"Unique values in 'Gemeinden': {unique_gemeinden}")
Total entries in 'Gemeinden': 1048575 Unique values in 'Gemeinden': 5752
# Converting 'Gemeinden' to the pandas 'category' dtype pays off when the
# number of distinct values is small relative to the row count; otherwise
# keep plain strings. The 50% cut-off is an arbitrary threshold.
if unique_gemeinden < total_gemeinden / 2:
    zulassungen_df['Gemeinden'] = zulassungen_df['Gemeinden'].astype('category')
    print("Converted 'Gemeinden' column to category type.")
else:
    zulassungen_df['Gemeinden'] = zulassungen_df['Gemeinden'].astype('str')
    print("Converted 'Gemeinden' column to string type.")
Converted 'Gemeinden' column to category type.
# Check the updated data types after the conversions above
print("Data types after converting 'Gemeinden':")
print(zulassungen_df.dtypes)
Data types after converting 'Gemeinden': Index int64 IDNummer object Gemeinden category Zulassung datetime64[ns] dtype: object
# Rebuild the dtype overview after the conversions above.
data_types_table = pd.DataFrame(zulassungen_df.dtypes, columns=['Data Type'])
# Normalise the dtype names to lower-case strings so they match the dict keys.
data_types_table['Data Type'] = data_types_table['Data Type'].astype(str).str.lower()
# Human-readable description of each dtype.
characteristics = {
    'int64': 'Integer data type, used for numeric data.',
    'float64': 'Floating point data type, used for numeric data with decimals.',
    'object': 'Object data type, often used for text data or mixed data types.',
    'datetime64[ns]': 'Datetime data type, used for date and time information.',
    'category': 'Category data type, used for categorical data to save memory.'
}
# Series.map + fillna is equivalent to apply(lambda x: d.get(x, default)).
data_types_table['Characteristics'] = data_types_table['Data Type'].map(characteristics).fillna('Unknown data type')
print(data_types_table)
Data Type Characteristics Index int64 Integer data type, used for numeric data. IDNummer object Object data type, often used for text data or ... Gemeinden category Category data type, used for categorical data ... Zulassung datetime64[ns] Datetime data type, used for date and time inf...
# Convert the table to markdown format
# NOTE: tabulate is a third-party package; it must be installed separately.
from tabulate import tabulate
markdown_table = tabulate(data_types_table, headers='keys', tablefmt='pipe')
# Print the markdown table
print("\nMarkdown Table:\n")
print(markdown_table)
Markdown Table: | | Data Type | Characteristics | |:----------|:---------------|:----------------------------------------------------------------| | Index | int64 | Integer data type, used for numeric data. | | IDNummer | object | Object data type, often used for text data or mixed data types. | | Gemeinden | category | Category data type, used for categorical data to save memory. | | Zulassung | datetime64[ns] | Datetime data type, used for date and time information. |
Data Types in "Zulassungen_aller_Fahrzeuge"
Data Types and Characteristics¶
| Attribute | Data Type | Characteristics |
|---|---|---|
| Index | int64 | Integer data type, used for numeric data. |
| IDNummer | object | Object data type, often used for text data or mixed data types. |
| Gemeinden | category | Category data type, used for categorical data to save memory. |
| Zulassung | datetime64[ns] | Datetime data type, used for date and time information. |
Linear Model for Mileage¶
Create a linear model from the table “Fahrzeuge_OEM1_Typ11_Fehleranalyse” relating mileage to suitable variables. Derive recommendations for OEM1 based on this model.
# 'Unnamed: 0' and 'X1' carry duplicated index information, so remove
# them before modelling.
Fahrzeuge_df = Fahrzeuge_df.drop(['Unnamed: 0', 'X1'], axis=1)
# Quick look at the remaining structure.
Fahrzeuge_df.head()
| X | ID_Fahrzeug | Herstellernummer | Werksnummer | Fehlerhaft_Datum | Fehlerhaft_Fahrleistung | days | fuel | engine | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 9 | 11-1-11-9 | 1 | 11 | 2010-03-16 | 34824.319559 | 1493.150761 | 4.003670 | small |
| 1 | 11 | 11-1-11-11 | 1 | 11 | 2010-03-16 | 74217.428309 | 1044.462231 | 11.042487 | large |
| 2 | 13 | 11-1-11-13 | 1 | 11 | 2010-03-16 | 32230.699639 | 749.669810 | 3.579117 | small |
| 3 | 15 | 11-1-11-15 | 1 | 11 | 2010-03-16 | 44885.783551 | 858.688003 | 4.666801 | small |
| 4 | 37 | 11-1-11-37 | 1 | 11 | 2010-03-17 | 86348.329866 | 1478.204174 | 4.634381 | small |
# Inspect dtypes before one-hot encoding the categorical columns.
print(Fahrzeuge_df.dtypes)
X int64 ID_Fahrzeug object Herstellernummer int64 Werksnummer int64 Fehlerhaft_Datum object Fehlerhaft_Fahrleistung float64 days float64 fuel float64 engine object dtype: object
# One-hot encode the categorical predictors; drop_first removes one level
# per variable to avoid the dummy-variable trap (perfect collinearity).
categorical_cols = ['Herstellernummer', 'Werksnummer', 'engine']
Fahrzeuge_df = pd.get_dummies(Fahrzeuge_df, columns=categorical_cols, drop_first=True)
Fahrzeuge_df.head()
| X | ID_Fahrzeug | Fehlerhaft_Datum | Fehlerhaft_Fahrleistung | days | fuel | Werksnummer_12 | engine_medium | engine_small | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 9 | 11-1-11-9 | 2010-03-16 | 34824.319559 | 1493.150761 | 4.003670 | False | False | True |
| 1 | 11 | 11-1-11-11 | 2010-03-16 | 74217.428309 | 1044.462231 | 11.042487 | False | False | False |
| 2 | 13 | 11-1-11-13 | 2010-03-16 | 32230.699639 | 749.669810 | 3.579117 | False | False | True |
| 3 | 15 | 11-1-11-15 | 2010-03-16 | 44885.783551 | 858.688003 | 4.666801 | False | False | True |
| 4 | 37 | 11-1-11-37 | 2010-03-17 | 86348.329866 | 1478.204174 | 4.634381 | False | False | True |
# Re-check dtypes: the dummy columns come back as bool.
print(Fahrzeuge_df.dtypes)
X int64 ID_Fahrzeug object Fehlerhaft_Datum object Fehlerhaft_Fahrleistung float64 days float64 fuel float64 Werksnummer_12 bool engine_medium bool engine_small bool dtype: object
# Parse the failure dates (rows contain mixed date formats).
Fahrzeuge_df['Fehlerhaft_Datum'] = pd.to_datetime(Fahrzeuge_df['Fehlerhaft_Datum'], format='mixed')

# Target: mileage at failure. Features: all remaining columns except the
# target itself, identifiers, the raw date, 'days', and the Werksnummer dummy.
excluded = ['Fehlerhaft_Fahrleistung', 'Fehlerhaft_Datum', 'Werksnummer_12', 'days', 'ID_Fahrzeug']
y = Fahrzeuge_df['Fehlerhaft_Fahrleistung']
X = Fahrzeuge_df.drop(columns=excluded)

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit an ordinary least-squares regression.
model = LinearRegression()
model.fit(X_train, y_train)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
# Score the model on the held-out set.
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Pair each fitted coefficient with its feature name for readability.
coefficients = pd.Series(model.coef_, index=X.columns)

model_summary = {
    "Mean Squared Error": mse,
    "R-squared": r2,
    "Coefficients": coefficients,
}
model_summary
{'Mean Squared Error': 138763043.67982456,
'R-squared': 0.4760476032405211,
'Coefficients': X 0.027603
fuel 5252.134642
engine_medium 8659.265984
engine_small 11055.022434
dtype: float64}
# Visualise in-sample fit: predicted vs. actual mileage on the training data.
y_train_pred = model.predict(X_train)

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_train, y_train_pred, color='blue', alpha=0.5)
# Reference diagonal: a perfect model would place every point on this line.
lims = [y_train.min(), y_train.max()]
ax.plot(lims, lims, 'k--', lw=3)
ax.set_xlabel('Actual Mileage (Fehlerhaft_Fahrleistung)')
ax.set_ylabel('Predicted Mileage (Fehlerhaft_Fahrleistung)')
ax.set_title('Actual vs Predicted Mileage (Training Set)')
ax.grid(True)
plt.show()
Hit and Run Accident Investigation¶
On 11.08.2010, there was a hit-and-run accident. The license plate of the car involved is unknown. The police have asked for your assistance, as you work for the Federal Motor Transport Authority, to find out where the vehicle with body part number “K5-112-1122-79” was registered.
# Locate the vehicle built with body part "K5-112-1122-79".
body_part_mask = Bestandteile_Fahrzeuge_OEM1_Typ12_df['ID_Karosserie'] == 'K5-112-1122-79'
vehicle_info = Bestandteile_Fahrzeuge_OEM1_Typ12_df[body_part_mask]

# Take the matching vehicle ID, or None when the part is not found.
vehicle_id = vehicle_info['ID_Fahrzeug'].values[0] if not vehicle_info.empty else None
vehicle_id
'12-1-12-82'
# Cross-reference the vehicle ID against the registration records.
registration_info = zulassungen_df.loc[zulassungen_df['IDNummer'] == vehicle_id]

# The 'Gemeinden' entry of the match is the place of registration.
registration_place = registration_info['Gemeinden'].values[0] if not registration_info.empty else None
registration_place
'ASCHERSLEBEN'